
clear all
set more off

* Root folder of the project; every path below is built from this global
global dir /Volumes/Zihao_SSD2/PatentsView

*** This part cleans and extracts attorneys' firstname and lastname
*** Zihao Li. 06/2024
import delimited $dir/rawdata/g_attorney_disambiguated.tsv, clear

rename (disambig_attorney_name_first disambig_attorney_name_last) (first_name last_name)
format %30s first_name last_name disambig_attorney_organization

* Trim leading/trailing blanks BEFORE testing for an empty first name, so that
* first names consisting only of blanks are dropped as well
replace first_name = strtrim(first_name)
replace last_name = strtrim(last_name)
drop if first_name == ""
sort first_name

* Clean suffixes
* Each pattern is deleted wherever it occurs in last_name; patterns are applied
* one at a time in the order listed.
foreach sfx in ", Jr." ", Jr," ", Jr" ". Jr." ". Jr" " Jr." {
	replace last_name = subinstr(last_name, "`sfx'", "", .)
}

* A bare " Jr" is only removed when it is the last three characters
replace last_name = substr(last_name, 1, length(last_name) - 3) if substr(last_name, -3, .) == " Jr"

foreach sfx in ", JR." ", V" ", IV." ", IV" " IV" ///
		", III." ", III" ",III" " III" ///
		", II." ", II" ",II" " II" ", I" ///
		", Sr." ", SR." {
	replace last_name = subinstr(last_name, "`sfx'", "", .)
}

* A digit zero followed by an apostrophe becomes the letter O
replace last_name = subinstr(last_name, "0'", "O'", .)

* Full name = first name, one blank, cleaned last name
gen attorney_name = first_name + " " + last_name

* Clean special characters (this is a comprehensive list)
* Every pattern is deleted from attorney_name, in the order listed. Braces and
* quote characters are handled with explicit statements to keep the quoting simple.
foreach sym in "?" "!" "(" ")" {
	replace attorney_name = subinstr(attorney_name, "`sym'", "", .)
}
replace attorney_name = subinstr(attorney_name, "{", "", .)
replace attorney_name = subinstr(attorney_name, "}", "", .)

* "/ " comes before "/" so that a slash followed by a blank loses the blank too
foreach sym in "[" "]" "/ " "/" {
	replace attorney_name = subinstr(attorney_name, "`sym'", "", .)
}

* Single and double quotation marks
replace attorney_name = subinstr(attorney_name, "'", "", .)
replace attorney_name = subinstr(attorney_name, `"""', "", .)

foreach sym in ";" "¸" "¥" "¶" "¨" "¤" "±" "&" "§a" {
	replace attorney_name = subinstr(attorney_name, "`sym'", "", .)
}
* NOTE(review): the search pattern of the rule below reads as a bare capital "A",
* which would rewrite every "A" in every name (e.g. "Adam" -> "Aneldam"). The
* pattern has most likely lost a non-ASCII character. The rule is disabled until
* the intended pattern is restored.
* replace attorney_name = subinstr(attorney_name, "A", "Anel", .)

* Transliterate accented and other non-ASCII letters to plain ASCII.
* Each entry is "<pattern> <replacement>". Entries are applied one at a time in
* the order listed, so an earlier rule can feed a later one (for example "Ã" -> "A"
* runs before the "A³" and "A¼" rules).
* NOTE(review): a few patterns appear twice; both copies are kept in case they
* differ in Unicode normalization form -- TODO confirm.
foreach rule in ///
		"å a" "á a" "à a" "ä a" "æ ae" "ã a" "â a" "a¹ a" ///
		"Á A" "Å A" "Ä° A" "Ä A" "Ã A" "Ā A" "Ã A" "Â A" "Ã³ A" "A³ A" "Ã¼ A" "A¼ A" ///
		"ß b" ///
		"č c" "ć c" "c̆ c" "Ç C" "ç c" "¢ c" "Č C" "© c" "Ç C" ///
		"É E" "È E" "é e" "è e" "ě e" "ë e" "ȩ e" ///
		"ǧ g" "ğ g" "ǧ̃ g" "g̃ g" "gˇ g" ///
		"İ I" "Î I" "Í I" "Ì I" ///
		"i̇ i" "ı̈ i" "ï i" "ı¨ i" "ı́ i" "í i" "ĭ i" "iˇ i" "iˆ i" "ı i" "î i" "¡ i" ///
		"Ł L" "Ľ L" "ł l" ///
		"Ñ N" "ń n" "ñ n" ///
		"Ø O" "Ó O" "Ö O" "Ò O" "Ō O" "ö o" "Ó O" "ó o" "ò o" "ø o" "ô o" "ő o" "œ oe" ///
		"ř r" "š s" "ş s" "Š S" "Ş S" "Ś S" ///
		"ü u" "ú u" "ü u" "Ü U" "Ú U" "ý y" "Ž Z" "ž z" {
	* First word of the entry is the pattern, second word is its replacement
	local src : word 1 of `rule'
	local dst : word 2 of `rule'
	replace attorney_name = subinstr(attorney_name, "`src'", "`dst'", .)
}


** Extract attorney first and last names from the cleaned full name
* NOTE: this section relies on local macros, so run it in one go rather than
* line by line.
replace attorney_name = strtrim(attorney_name)

* Tokenise on blanks. The number of tokens depends on the data, so it is read
* from split's stored results instead of being hard-coded.
split attorney_name, parse(" ")
local ntok = r(nvars)
local tokvars `r(varlist)'

* Add spaces after dots if they aren't there already. Add dots to middle initials.
forval y = 1/`ntok' {
	replace attorney_name`y' = subinstr(attorney_name`y', ".", ". ", .)
	replace attorney_name`y' = strrtrim(attorney_name`y')
	* A token that is a single capital letter is treated as an initial
	replace attorney_name`y' = attorney_name`y' + "." if ustrregexm(attorney_name`y', "[A-Z]$") & ustrlen(attorney_name`y') == 1
}

* Reshape into a full_name format
sort attorney_name
replace attorney_name = attorney_name1
forval y = 2/`ntok' {
	replace attorney_name = attorney_name + " " + attorney_name`y' if !missing(attorney_name`y')
}
drop `tokvars'

replace attorney_name = proper(attorney_name)
* Drop a stray leading ". "
replace attorney_name = substr(attorney_name, 3, .) if substr(attorney_name, 1, 2) == ". "

* Change x. xxx xxx into xxx x. xxx
* Re-tokenise: the dot spacing above can change the number of tokens.
split attorney_name, parse(" ")
local ntok = r(nvars)
local tokvars `r(varlist)'
gen attorney_namenew1 = attorney_name1

* The swap needs at least three tokens (initial, given name, surname); the
* tokens 2 and 3 only exist when split produced that many.
if `ntok' >= 3 {
	* Swap only when token 1 is an initial, token 2 is neither an initial nor a
	* surname particle (de/van/von/la), and a third token exists
	replace attorney_namenew1 = attorney_name2 ///
		if ustrregexm(attorney_name1, "^[A-Z]\.") == 1 ///
		& ustrregexm(attorney_name2, "[A-Za-z]\.") == 0 ///
		& !inlist(lower(attorney_name2), "de", "van", "von", "la") ///
		& attorney_name3 != ""

	* Where a swap happened, the initial moves to second position
	replace attorney_name2 = attorney_name1 if attorney_name1 != attorney_namenew1
}

gen attorney_namenew = attorney_namenew1
forval y = 2/`ntok' {
	replace attorney_namenew = attorney_namenew + " " + attorney_name`y' if !missing(attorney_name`y')
}

order patent_id attorney_name attorney_namenew attorney_namenew1
drop attorney_name
rename attorney_namenew attorney_name

* Generate last name (last non-missing entry)
gen last_name_clean = ""
forval y = 1/`ntok' {
	replace last_name_clean = attorney_name`y' if !missing(attorney_name`y')
}
* Generate first name
rename attorney_namenew1 first_name_clean
format %30s attorney_name
drop `tokvars'

order patent_id attorney_name first_name first_name_clean last_name last_name_clean

* Strip a leading hyphen from each of the name fields
foreach v in attorney_name first_name_clean last_name_clean {
	replace `v' = substr(`v', 2, .) if substr(`v', 1, 1) == "-"
}

* Individual fixes for last names
* When the extracted last name is exactly two characters long and the raw
* last_name contains one of these surnames, use that surname instead.
* The surnames are tried in the order listed.
foreach surname in Gonzalez Dieu Krawczewicz Ngathi Tran Yu {
	replace last_name_clean = "`surname'" if strpos(last_name, "`surname'") & length(last_name_clean) == 2
}

* Name made of the cleaned first and last name only
gen attorney_name_nomid = first_name_clean + " " + last_name_clean

* Column order: attorney_name, attorney_name_nomid, attorney_sequence
order attorney_name_nomid attorney_sequence, after(attorney_name)

* Ascending patent_id, descending attorney_sequence within patent
gsort patent_id -attorney_sequence


* Export dataset
* first_name_clean is stored in lower case in the saved file
replace first_name_clean = lower(first_name_clean)
save $dir/temp/g_attorney_clean.dta, replace
// export delimited using $dir/temp/g_attorney_clean_temp.csv, replace

* Export firstname to genderize
* Writes one row per distinct first_name_clean, sorted; the full dataset is
* restored afterwards.
preserve
keep first_name_clean
bysort first_name_clean: keep if _n == 1
export delimited using $dir/temp/g_attorney_clean_firstname.csv, replace
restore




*** Part 2: merges attorneys' gender
import delimited $dir/temp/g_attorney_firstname_gendered_r3.csv, clear
* First names without a gender are labelled ambiguous
replace gender = "ambiguous" if gender == ""
rename (round count probability) (gender_round gender_count gender_prob)
drop name_cleaned name

* Keep only first names present in both files
merge 1:m first_name_clean using $dir/temp/g_attorney_clean.dta, keep(match) nogenerate
format %20s first_name_clean last_name_clean first_name last_name
sort patent_id attorney_name

* Different thresholds for attorney gender
* a_gender_0P_N is "ambiguous" when gender_prob < 0.P, or (for N > 0) when
* gender_count < N.
* Both sides of the probability comparison are rounded to float precision.
* gender_prob may be stored as float after import; a stored 0.9 would then
* compare as less than the double literal 0.9 and be flagged ambiguous.
foreach p in 9 8 {
	gen a_gender_0`p'_0 = gender
	replace a_gender_0`p'_0 = "ambiguous" if float(gender_prob) < float(0.`p')

	foreach n in 50 100 {
		gen a_gender_0`p'_`n' = gender
		replace a_gender_0`p'_`n' = "ambiguous" if float(gender_prob) < float(0.`p') | gender_count < `n'
	}
}

rename (gender gender_count gender_prob) (a_gender a_gender_count a_gender_prob)
drop attorney_name_nomid first_name_clean last_name_clean first_name last_name gender_round

order patent_id attorney_name attorney_sequence a_gender_09_0 a_gender_09_50 a_gender_09_100
sort patent_id attorney_sequence

* The attorney always corresponds to the citing patent
rename patent_id patent_id_i

*** Export dataset
save $dir/temp/g_attorney_gender_temp.dta, replace





